1 Load libraries

library(tidyverse)          
library(clusterProfiler)    
library(enrichplot)         
library(org.Hs.eg.db)  
library(Cairo) 
library(VennDiagram)       
library(grid)               
library(simplifyEnrichment)
library(readxl)

2 Define functions

# map_cluster_number: 
#   - x: enrichResult or compareClusterResult (from clusterProfiler)
#   - df: simplifyGO result data.frame with columns ID (term IDs) and Cluster_num
#   - comp: if TRUE, use x@compareClusterResult, else x@result
# Returns a data.frame of all enrich columns + term_size + Cluster_num
map_cluster_number <- function(x,
                               df,
                               comp = FALSE) {
  ## 1. Standardize the input DF’s column names
  colnames(df) <- c("ID", "Cluster_num")

  ## 2. Select the correct slot from the clusterProfiler object
  if (comp) {
    # from compareClusterResult
    results <- x@compareClusterResult
  } else {
    # from a single enrichResult
    results <- x@result
  }

  ## 3. Get term_size (the numerator of the "BgRatio" string)
  ##    e.g. "12/21273" → 12
  results$term_size <- as.numeric(
    sapply(strsplit(results$BgRatio, "/"),
           function(parts) as.numeric(parts[1]))
  )

  ## 4. Merge the enrichment results with the cluster assignments
  merged_results <- merge(
    results,
    df,
    by    = "ID",
    all.x = TRUE
  )

  ## 5. Return the processed data frame
  return(merged_results)
}


# Function to map Ensembl IDs to gene symbols for a single row
map_geneID_to_symbol <- function(geneID_str) {
  geneIDs <- unlist(strsplit(geneID_str, "/"))
  mapped_ids <- bitr(geneIDs, 
                     fromType = "ENSEMBL", 
                     toType = "SYMBOL", 
                     OrgDb = org.Hs.eg.db)
  
  # drop duplicates in ensembl column and keep first occurence
  mapped_ids <- mapped_ids[!duplicated(mapped_ids$ENSEMBL),]
  
  gene_symbols <- mapped_ids$SYMBOL
  return(paste(gene_symbols, collapse = "/"))
}


# Function to map Ensembl IDs to gene symbols for a single row
map_geneID_to_name <- function(geneIDs) {
  mapped_ids <- bitr(geneIDs, 
                     fromType = "ENSEMBL", 
                     toType = c("GENENAME", "SYMBOL"), 
                     OrgDb = org.Hs.eg.db)
  
  # drop duplicates in ensembl column and keep first occurence
  mapped_ids <- mapped_ids[!duplicated(mapped_ids$ENSEMBL),]
  
  return(mapped_ids)
}

3 Data loading and preprocessing

3.1 Set input and output paths

# set input and output paths
in_path <- "/mnt/Data/Projects/Cilia/revision/NonRestricted/data/"
out_path <- "/mnt/Data/Projects/Cilia/revision/NonRestricted/analysis/GO_BP/"

3.2 Load data

# Load the data
df_all <- read.delim(paste0(in_path, "All_files_combined_as_cytoscape_input.csv"), sep = "\t", header = TRUE, row.names = 1)
head(df_all)

3.3 Map gene names and symbols

# Identify columns that contain "num" in their names
num_columns <- grep("num", names(df_all), value = TRUE)

# Convert all other columns from string to logical
df_all <- df_all %>%
  mutate(across(-all_of(num_columns), ~ as.logical(.)))

# map gene IDs to gene names 
mapped_ids <- map_geneID_to_name(rownames(df_all))

# index by Ensembl
rownames(mapped_ids) <- mapped_ids$ENSEMBL

# pull out exactly one entry per row of df_all
df_all$GeneSymbol <- mapped_ids[ rownames(df_all), "SYMBOL"   ]
df_all$GeneName   <- mapped_ids[ rownames(df_all), "GENENAME" ]

# add rownames as column and reset index
df_all$Ensembl_ID <- rownames(df_all)
rownames(df_all) <- NULL

# reorder columns to have Ensembl_ID first, Symbol and Gene name first and then all other columns
df_all <- df_all %>%  dplyr::select(Ensembl_ID, GeneSymbol, GeneName, everything())

3.4 Split data by location

# Perform filtering again
df_bb <- df_all %>% filter(BasalBody_ == TRUE)
df_pc <- df_all %>% filter(PrimaryCilia_ == TRUE)
df_tip <- df_all %>% filter(PrimaryCiliaTip_ == TRUE)
df_tz <- df_all %>% filter(PrimaryCiliaTZ_ == TRUE)

# filter gene_id by location
gene_id_all <- df_all$Ensembl_ID
gene_id_bb <- df_bb$Ensembl_ID
gene_id_pc <- df_pc$Ensembl_ID
gene_id_tip <- df_tip$Ensembl_ID
gene_id_tz <- df_tz$Ensembl_ID

3.5 Combine bb & tz and pc & tip

# combine pc and tip and tz
df_pc_tip_tz <- rbind(df_pc, df_tip, df_tz)

# drop duplicates based on Ensembl_ID column
df_pc_tip_tz <- df_pc_tip_tz[!duplicated(df_pc_tip_tz$Ensembl_ID), ]
gene_id_pc_tip_tz <- df_pc_tip_tz$Ensembl_ID

4 Compare biological themes for different locations with variable proteins

4.1 Define variable genes

# Load the data from the Excel file
df <- read_excel("/mnt/Data/Projects/Cilia/revision/Filtered_Staining_List_combined_exploded.xlsx")

# Identify Ensembl IDs for each category by searching substrings in the "Annotation (Intensity)" column
tip_variable <- unique(df$`Ensembl id`[grepl("Primary cilium tip", df$`Annotation (Intensity)`)])
pc_variable  <- unique(df$`Ensembl id`[grepl("Primary cilium",     df$`Annotation (Intensity)`)])
tz_variable  <- unique(df$`Ensembl id`[grepl("Primary cilium transition zone", df$`Annotation (Intensity)`)])

# Combine all variable genes and remove duplicates
variable_genes <- unique(c(pc_variable, tip_variable, tz_variable))

# Identify any non-variable genes
non_variable <- setdiff(gene_id_pc_tip_tz, variable_genes)

4.2 Map as columns to df_all and save as csv file

# add "Intensity variation" columns to df_all with true if in variable genes, false if in non_variable and NA otherwise
df_all <- df_all %>%
  mutate(
    Intensity_variation = case_when(
      Ensembl_ID %in% variable_genes ~ TRUE,
      Ensembl_ID %in% non_variable   ~ FALSE,
      TRUE                            ~ NA   # catch-all case for any other Ensembl IDs
    )
  )

# write to file
write.csv(df_all, file = paste0(out_path, "NonRestricted_all_proteins_enrichment_input_mapping.csv"))

4.3 GO BP enrichment analysis

# Split data by location
input_genes <- list(
  non_variable = non_variable,
  variable = variable_genes
)

# check length of each list
lapply(input_genes, length)
## $non_variable
## [1] 109
## 
## $variable
## [1] 409
comp <- compareCluster(geneCluster = input_genes,
                     fun = "enrichGO",
                     OrgDb = org.Hs.eg.db,
                     keyType       = 'ENSEMBL',
                     ont           = "BP",
                     pAdjustMethod = "BH",
                     pvalueCutoff  = 0.01,
                     qvalueCutoff  = 0.01)

4.3.1 Dot plot of all enriched terms

# plot dotplot
dotplot(comp, showCategory = NULL) + ggtitle("GO BP Enrichment Comparison", subtitle = "qvalue < 0.01") +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 100))

# save dotplot as svg file
ggsave(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot.svg"), plot = last_plot(), device = "svg", width = 14, height = 50, limitsize = FALSE)

4.3.2 Visualize overlap of enriched terms

# extract results
results <- comp@compareClusterResult

# split by location
variable <- results[results$Cluster == "variable",]
non_variable <- results[results$Cluster == "non_variable",]

# Create a list of sets
go_lists <- list(
  variable = variable$ID,
  non_variable = non_variable$ID
)

# Plot the Venn diagram
venn.plot <- venn.diagram(
  x = go_lists,
  category.names = c("variable", "non_variable"),
  filename = NULL,
  output = TRUE
)

grid.newpage()
grid.draw(venn.plot)

# Save the captured plot as an SVG file
svg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_venn.svg"), width = 6, height = 6)
grid.draw(venn.plot)
dev.off()
## svg 
##   2

4.3.3 Filter terms only enriched in one condition

# calculate intersection of the two
unspecific_terms <- intersect(variable$ID, non_variable$ID)

# remove all unspecific terms
specific_terms <- results %>% filter(!ID %in% unspecific_terms)

# create a copy of comp
comp_filtered <- comp

# update results in comp
comp_filtered@compareClusterResult <- specific_terms

4.3.4 Dot plot of uniquely enriched terms

# plot dotplot
dotplot(comp_filtered, showCategory = NULL) + ggtitle("GO BP Enrichment Comparison", subtitle = "qvalue < 0.01")  +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 100))

# save dotplot as svg file
ggsave(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only.svg"), plot = last_plot(), device = "svg", width = 14, height = 50, limitsize = FALSE)
# Subset for pc_tip
variable <- comp_filtered@compareClusterResult[
  comp_filtered@compareClusterResult$Cluster == "variable", 
]

# Subset for bb_tz
non_variable <- comp_filtered@compareClusterResult[
  comp_filtered@compareClusterResult$Cluster == "non_variable", 
]

# Create new compareClusterResult objects for each subset
comp_filtered_variable <- comp_filtered
comp_filtered_non_variable <- comp_filtered

comp_filtered_variable@compareClusterResult <- variable
comp_filtered_non_variable@compareClusterResult <- non_variable

4.3.5 Cluster results - variable proteins

go_id = comp_filtered_variable@compareClusterResult$ID
mat = GO_similarity(go_id, 
                    ont = 'BP', 
                    db = 'org.Hs.eg.db', 
                    measure = "Sim_Relevance_2006"
                    )

4.3.5.1 Plot cluster heatmap

# Capture the plot
heatmap_plot <- grid.grabExpr({
 df <- simplifyGO(mat,
             method = 'binary_cut',
             plot = TRUE,
             column_title = "GO BP terms only significant in variable proteins",
             use_raster = FALSE,
             order_by_size = TRUE,
             fontsize_range = c(9, 18),
             max_words = 6,
             word_cloud_grob_param = list(col = 'black', 
                                          max_width = unit(200, "mm")))
})

# Save the captured plot as an SVG file
svg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_variable_heatmap.svg"), width = 16, height = 9)
grid.draw(heatmap_plot)
dev.off()
## pdf 
##   2
grid.newpage()
grid.draw(heatmap_plot)

4.3.5.2 Process and save results

# add cluster number from GO term clustering
results <- map_cluster_number(comp_filtered_variable,
                              df = df,
                              comp = TRUE
)

# Apply the function to each row of the DataFrame
results$GeneSymbol <- sapply(results$geneID, map_geneID_to_symbol)

# save results as csv file
write.csv(results, file = paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_variable_result.csv"))

4.3.6 Cluster results - non-variable proteins

go_id = comp_filtered_non_variable@compareClusterResult$ID
mat = GO_similarity(go_id, 
                    ont = 'BP', 
                    db = 'org.Hs.eg.db', 
                    measure = "Sim_Relevance_2006"
                    )

4.3.6.1 Plot cluster heatmap

# Capture the plot
heatmap_plot <- grid.grabExpr({
 df <- simplifyGO(mat,
             method = 'binary_cut',
             plot = TRUE,
             column_title = "GO BP terms only significant in non-variable proteins",
             use_raster = FALSE,
             order_by_size = TRUE,
             fontsize_range = c(18, 36),
             max_words = 6,
             word_cloud_grob_param = list(col = 'black', 
                                          max_width = unit(200, "mm")))
})

# Save the captured plot as an SVG file
svg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_nonvariable_heatmap.svg"), width = 16, height = 9)
grid.draw(heatmap_plot)
dev.off()
## pdf 
##   2
grid.newpage()
grid.draw(heatmap_plot)

4.3.6.2 Process and save results

# add cluster number from GO term clustering
results <- map_cluster_number(comp_filtered_non_variable,
                              df = df,
                              comp = TRUE
)

# Apply the function to each row of the DataFrame
results$GeneSymbol <- sapply(results$geneID, map_geneID_to_symbol)

# save results as csv file
write.csv(results, file = paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_nonvariable_result.csv"))

4.3.7 Filter for terms enriched in both variable and non-variable

 # get results as data frame
# extract results
results <- comp@compareClusterResult

# split by location
variable <- results[results$Cluster == "variable",]
non_variable <- results[results$Cluster == "non_variable",]

# calculate intersection of the two
unspecific_terms <- intersect(variable$ID, non_variable$ID)

# remove all unspecific terms
specific_terms <- results %>% filter(!ID %in% unspecific_terms)
unspecific_terms <- results %>% filter(ID %in% unspecific_terms)

# create a copy of comp
comp_filtered <- comp

# update results in comp
comp_filtered@compareClusterResult <- unspecific_terms

4.3.8 Plot dotplot of shared terms

# plot dotplot
dotplot(comp_filtered, showCategory = NULL) + ggtitle("GO BP Enrichment Comparison", subtitle = "qvalue < 0.01")  +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 100))

# save dotplot as svg file
ggsave(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_shared_terms_only.svg"), plot = last_plot(), device = "svg", width = 14, height = 10, limitsize = FALSE)

4.3.9 Cluster results - Both locations

# Split by location
variable <- comp_filtered@compareClusterResult[comp_filtered@compareClusterResult$Cluster == "variable", ]
non_variable <- comp_filtered@compareClusterResult[comp_filtered@compareClusterResult$Cluster == "non_variable", ]

# Create new compareClusterResult objects for each subset
comp_filtered_variable<- comp_filtered
comp_filtered_non_variable <- comp_filtered

comp_filtered_variable@compareClusterResult <- variable
comp_filtered_non_variable@compareClusterResult <- non_variable

4.3.9.1 Plot cluster heatmap

go_id = comp_filtered_variable@compareClusterResult$ID
mat = GO_similarity(go_id, 
                    ont = 'BP', 
                    db = 'org.Hs.eg.db', 
                    measure = "Sim_Relevance_2006"
                    )
# Capture the plot
heatmap_plot <- grid.grabExpr({
 df <- simplifyGO(mat,
             method = 'binary_cut',
             plot = TRUE,
             column_title = "GO BP terms significant in both locations",
             use_raster = FALSE,
             order_by_size = TRUE,
             fontsize_range = c(18, 36),
             max_words = 6,
             word_cloud_grob_param = list(col = 'black', 
                                          max_width = unit(200, "mm")))
})

# Save the captured plot as an SVG file
svg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_shared_terms_heatmap.svg"), width = 16, height = 9)
grid.draw(heatmap_plot)
dev.off()
## pdf 
##   2
grid.newpage()
grid.draw(heatmap_plot)

4.3.9.2 Process and save results

# add cluster number from GO term clustering
results <- map_cluster_number(comp_filtered_variable,
                              df = df,
                              comp = TRUE
)

# Apply the function to each row of the DataFrame
results$GeneSymbol <- sapply(results$geneID, map_geneID_to_symbol)

# save results as csv file
write.csv(results, file = paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_shared_terms_result_variable.csv"))

5 Session info

sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.5             simplifyEnrichment_2.2.0 VennDiagram_1.7.3       
##  [4] futile.logger_1.4.3      Cairo_1.6-2              org.Hs.eg.db_3.21.0     
##  [7] AnnotationDbi_1.70.0     IRanges_2.42.0           S4Vectors_0.46.0        
## [10] Biobase_2.68.0           BiocGenerics_0.54.0      generics_0.1.3          
## [13] enrichplot_1.28.2        clusterProfiler_4.16.0   lubridate_1.9.4         
## [16] forcats_1.0.0            stringr_1.5.1            dplyr_1.1.4             
## [19] purrr_1.0.4              readr_2.1.5              tidyr_1.3.1             
## [22] tibble_3.2.1             ggplot2_3.5.2            tidyverse_2.0.0         
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.17.1       jsonlite_2.0.0         
##   [4] shape_1.4.6.1           magrittr_2.0.3          modeltools_0.2-24      
##   [7] ggtangle_0.0.6          farver_2.1.2            rmarkdown_2.29         
##  [10] ragg_1.4.0              GlobalOptions_0.1.2     fs_1.6.6               
##  [13] vctrs_0.6.5             memoise_2.0.1           ggtree_3.16.0          
##  [16] htmltools_0.5.8.1       lambda.r_1.2.4          cellranger_1.1.0       
##  [19] gridGraphics_0.5-1      sass_0.4.10             bslib_0.9.0            
##  [22] plyr_1.8.9              futile.options_1.0.1    cachem_1.1.0           
##  [25] igraph_2.1.4            mime_0.13               lifecycle_1.0.4        
##  [28] iterators_1.0.14        pkgconfig_2.0.3         Matrix_1.7-3           
##  [31] R6_2.6.1                fastmap_1.2.0           gson_0.1.0             
##  [34] GenomeInfoDbData_1.2.14 shiny_1.10.0            clue_0.3-66            
##  [37] digest_0.6.37           aplot_0.2.5             colorspace_2.1-1       
##  [40] patchwork_1.3.0         textshaping_1.0.1       RSQLite_2.3.11         
##  [43] labeling_0.4.3          timechange_0.3.0        httr_1.4.7             
##  [46] compiler_4.5.0          bit64_4.6.0-1           withr_3.0.2            
##  [49] doParallel_1.0.17       BiocParallel_1.42.0     DBI_1.2.3              
##  [52] R.utils_2.13.0          scatterplot3d_0.3-44    rjson_0.2.23           
##  [55] tools_4.5.0             ape_5.8-1               flexclust_1.5.0        
##  [58] httpuv_1.6.16           R.oo_1.27.1             glue_1.8.0             
##  [61] nlme_3.1-168            GOSemSim_2.34.0         promises_1.3.2         
##  [64] cluster_2.1.8.1         reshape2_1.4.4          fgsea_1.34.0           
##  [67] gtable_0.3.6            tzdb_0.5.0              class_7.3-23           
##  [70] R.methodsS3_1.8.2       data.table_1.17.0       hms_1.1.3              
##  [73] xml2_1.3.8              XVector_0.48.0          ggrepel_0.9.6          
##  [76] foreach_1.5.2           pillar_1.10.2           yulab.utils_0.2.0      
##  [79] later_1.4.2             circlize_0.4.16         splines_4.5.0          
##  [82] treeio_1.32.0           lattice_0.22-5          bit_4.6.0              
##  [85] tidyselect_1.2.1        GO.db_3.21.0            ComplexHeatmap_2.24.0  
##  [88] tm_0.7-16               Biostrings_2.76.0       knitr_1.50             
##  [91] NLP_0.3-2               svglite_2.1.3           xfun_0.52              
##  [94] matrixStats_1.5.0       stringi_1.8.7           UCSC.utils_1.4.0       
##  [97] lazyeval_0.2.2          ggfun_0.1.8             yaml_2.3.10            
## [100] evaluate_1.0.3          codetools_0.2-20        qvalue_2.40.0          
## [103] Polychrome_1.5.4        ggplotify_0.1.2         cli_3.6.5              
## [106] systemfonts_1.2.3       xtable_1.8-4            jquerylib_0.1.4        
## [109] Rcpp_1.0.14             GenomeInfoDb_1.44.0     png_0.1-8              
## [112] parallel_4.5.0          simona_1.6.0            blob_1.2.4             
## [115] DOSE_4.2.0              slam_0.1-55             tidytree_0.4.6         
## [118] scales_1.4.0            crayon_1.5.3            GetoptLong_1.0.5       
## [121] rlang_1.1.6             cowplot_1.1.3           fastmatch_1.1-6        
## [124] KEGGREST_1.48.0         formatR_1.14